{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据形状: (614872, 10)\n",
      "列名: ['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code', 'valid_start_date', 'valid_end_date', 'invalid_reason']\n",
      "\n",
      "随机5行数据:\n",
      "        concept_id                            concept_name    domain_id  \\\n",
      "19940     37160018  Bilateral acute contact otitis externa    Condition   \n",
      "204533    42009502                                  北31条東1    Geography   \n",
      "335528     4169540                 Allergy to promethazine  Observation   \n",
      "418675    45761019                     Spinal hernia truss       Device   \n",
      "188121    41887924                                 Pomeroy    Geography   \n",
      "\n",
      "       vocabulary_id  concept_class_id standard_concept      concept_code  \\\n",
      "19940         SNOMED          Disorder                S  1082891000119109   \n",
      "204533           OSM        10th level                S           7165392   \n",
      "335528        SNOMED  Clinical Finding              NaN         294126003   \n",
      "418675        SNOMED   Physical Object                S         466961002   \n",
      "188121           OSM        10th level                S           6074615   \n",
      "\n",
      "        valid_start_date  valid_end_date  invalid_reason  \n",
      "19940           20230630        20991231             NaN  \n",
      "204533          19700101        20991231             NaN  \n",
      "335528          20020131        20991231             NaN  \n",
      "418675          20140731        20991231             NaN  \n",
      "188121          19700101        20991231             NaN  \n"
     ]
    }
   ],
   "source": [
    "# display the snomed file\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('/home/huangj2/Documents/RAG_P2_医疗名词/01.standardization/data/SNOMED-CT/SNOMED_valid_comma.csv',\n",
    "                low_memory=False\n",
    "                )\n",
    "\n",
    "# 显示数据的基本信息\n",
    "print(\"数据形状:\", df.shape)\n",
    "print(\"列名:\", df.columns.tolist())\n",
    "\n",
    "# 随机展示5行数据的完整内容\n",
    "print(\"\\n随机5行数据:\")\n",
    "pd.set_option('display.max_columns', None)  # 显示所有列\n",
    "pd.set_option('display.width', None)  # 显示所有内容不截断\n",
    "pd.set_option('display.max_colwidth', None)  # 显示每列的完整内容\n",
    "print(df.sample(5))  # 使用sample方法随机抽取5行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "数据类型和非空值统计:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 614872 entries, 0 to 614871\n",
      "Data columns (total 10 columns):\n",
      " #   Column            Non-Null Count   Dtype  \n",
      "---  ------            --------------   -----  \n",
      " 0   concept_id        614872 non-null  int64  \n",
      " 1   concept_name      614870 non-null  object \n",
      " 2   domain_id         614872 non-null  object \n",
      " 3   vocabulary_id     614871 non-null  object \n",
      " 4   concept_class_id  614872 non-null  object \n",
      " 5   standard_concept  547516 non-null  object \n",
      " 6   concept_code      614872 non-null  object \n",
      " 7   valid_start_date  614872 non-null  int64  \n",
      " 8   valid_end_date    614872 non-null  int64  \n",
      " 9   invalid_reason    0 non-null       float64\n",
      "dtypes: float64(1), int64(3), object(6)\n",
      "memory usage: 46.9+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# 显示每列的数据类型和非空值数量\n",
    "print(\"\\n数据类型和非空值统计:\")\n",
    "print(df.info())\n",
    "\n",
    "# 显示数值列的统计摘要\n",
    "# print(\"\\n数值列统计摘要:\")\n",
    "# print(df.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "包含'Chronic back pain的概念:\n",
      "找到 2 条包含'backpain'的记录\n",
      "        concept_code                                          concept_name  \\\n",
      "269506     134407002                                     Chronic back pain   \n",
      "434551  631000119102  Chronic back pain greater than three months duration   \n",
      "\n",
      "        domain_id  concept_class_id  \n",
      "269506  Condition  Clinical Finding  \n",
      "434551  Condition  Clinical Finding  \n"
     ]
    }
   ],
   "source": [
    "# 检索包含\"Headace\"的概念名称\n",
    "print(\"\\n包含'Chronic back pain的概念:\")\n",
    "backpain_concepts = df[df['concept_name'].str.contains('Chronic back pain', case=False, na=False)]\n",
    "print(f\"找到 {len(backpain_concepts)} 条包含'backpain'的记录\")\n",
    "print(backpain_concepts[['concept_code', 'concept_name', 'domain_id', 'concept_class_id']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "domain_id 和 concept_class_id 的列表:\n",
      "['Condition' 'Metadata' 'Condition Status' 'Type Concept' 'Cost' 'Visit'\n",
      " 'Unit' 'Episode' 'Revenue Code' 'Language' 'Relationship' 'Observation'\n",
      " 'Procedure' 'Measurement' 'Drug' 'Spec Anatomic Site' 'Specimen'\n",
      " 'Meas Value' 'Route' 'Device' 'Geography' 'Race' 'Provider' 'Plan'\n",
      " 'Plan Stop Reason' 'Meas Value Operator' 'Gender' 'Payer' 'Sponsor']\n",
      "['Clinical Finding' 'Disorder' 'Field' 'Table' 'CDM' 'Concept Class'\n",
      " 'Condition Status' 'Death Type' 'Condition Type' 'Detail' 'Summary'\n",
      " 'Cost Type' 'Device Type' 'Domain' 'Drug Type' 'Vocabulary' 'Visit'\n",
      " 'Unit' 'Episode of Care' 'Treatment' 'Disease Extent' 'Disease Dynamic'\n",
      " 'Revenue Code' 'Language' 'Relationship' 'Procedure' 'Observable Entity'\n",
      " 'Substance' 'Location' 'Body Structure' 'Specimen' 'Qualifier Value'\n",
      " 'Physical Object' 'Organism' 'Pharma/Biol Product' 'Model Comp'\n",
      " 'Social Context' 'Morph Abnormality' 'Dose Form' 'Context-dependent'\n",
      " 'Clinical Drug' 'Event' 'Clinical Drug Form' 'Disposition'\n",
      " 'Staging / Scales' 'Record Artifact' 'Namespace Concept' 'Physical Force'\n",
      " 'Attribute' 'Meas Type' 'Metadata' 'Undefined' 'Note Type' '9th level'\n",
      " '8th level' '10th level' '6th level' '7th level' '4th level' '11th level'\n",
      " '5th level' '3rd level' '2nd level' '12th level' 'Obs Period Type'\n",
      " 'Observation Type' 'Benefit' 'Metal level' 'Plan Stop Reason'\n",
      " 'Procedure Type' 'Navi Concept' 'Special Concept' 'Payer' 'Sponsor'\n",
      " 'Type Concept' 'UB04 Point of Origin' 'UB04 Pri Typ of Adm'\n",
      " 'UB04 Pt dis status' 'Observation' 'Canonical Unit' 'US Census Region'\n",
      " 'US Census Division' 'Visit Type' 'Linkage Concept' 'Linkage Assertion']\n"
     ]
    }
   ],
   "source": [
    "# domain_id 和 concept_class_id - 展示一下列表\n",
    "print(\"\\ndomain_id 和 concept_class_id 的列表:\")\n",
    "print(df['domain_id'].unique())\n",
    "print(df['concept_class_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "包含'Dyspnea'的概念:\n",
      "找到 24 条包含'Dyspnea'的记录\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "\"['Synonyms'] not in index\"",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m      3\u001b[0m dyspnea_concepts \u001b[38;5;241m=\u001b[39m df[df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mconcept_name\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDyspnea\u001b[39m\u001b[38;5;124m'\u001b[39m, case\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, na\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)]\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m找到 \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(dyspnea_concepts)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 条包含\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDyspnea\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m的记录\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdyspnea_concepts\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mconcept_code\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mconcept_name\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSynonyms\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m)\n",
      "File \u001b[0;32m/mnt/external_disk/venv/20250203_SimplePython/lib/python3.10/site-packages/pandas/core/frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   4106\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m   4107\u001b[0m         key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4108\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m   4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m   4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
      "File \u001b[0;32m/mnt/external_disk/venv/20250203_SimplePython/lib/python3.10/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m   6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   6198\u001b[0m     keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m   6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m   6204\u001b[0m     \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "File \u001b[0;32m/mnt/external_disk/venv/20250203_SimplePython/lib/python3.10/site-packages/pandas/core/indexes/base.py:6252\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m   6249\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[0;31mKeyError\u001b[0m: \"['Synonyms'] not in index\""
     ]
    }
   ],
   "source": [
    "# 检索包含\"Dyspnea\"的概念名称\n",
    "print(\"\\n包含'Dyspnea'的概念:\")\n",
    "dyspnea_concepts = df[df['concept_name'].str.contains('Dyspnea', case=False, na=False)]\n",
    "print(f\"找到 {len(dyspnea_concepts)} 条包含'Dyspnea'的记录\")\n",
    "print(dyspnea_concepts[['concept_code', 'concept_name', 'Synonyms']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 321341 - 拿这个索引号， 第321341行\n",
    "print(\"\\n检索第321341行的数据:\")\n",
    "print(df.iloc[321341])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 尝试使用模糊匹配\n",
    "print(\"\\n尝试模糊匹配:\")\n",
    "fuzzy_matches = df[df['concept_code'].str.contains('267036', na=False)]\n",
    "if len(fuzzy_matches) > 0:\n",
    "    print(\"找到的模糊匹配:\")\n",
    "    print(fuzzy_matches[['concept_code', 'concept_name']])\n",
    "\n",
    "# 检查Dyspnea相关的行\n",
    "print(\"\\n检查Dyspnea相关的行:\")\n",
    "dyspnea_rows = df[df['concept_name'].str.contains('Dyspnea', case=False, na=False)]\n",
    "print(dyspnea_rows[['concept_code', 'concept_name']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (20250203_SimplePython)",
   "language": "python",
   "name": "20250203_simplepython"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
